import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Read the raw player data from the Excel workbook and preview the first rows.
df = pd.read_excel(io='data.xlsx')
df.head(n=5)
We will not be using all of the features present in our data set for our analysis, so we pick only those needed.
# Keep an untouched copy of the raw data before narrowing it down.
df_copy = df.copy()

# Take an explicit copy of the selected columns: the statements below drop
# rows and overwrite columns, and doing that on a bare column selection can
# trigger pandas' SettingWithCopyWarning.
df = df[[
    'Age', 'Nationality', 'Club', 'Release Clause', 'Wage', 'Value',
    'Preferred Foot', 'Position', 'Weight', 'Finishing', 'Dribbling',
    'BallControl', 'Stamina', 'Jumping', 'SlidingTackle', 'GKReflexes',
    'Body Type',
]].copy()
df
df.info()
# checking for missing values
df.isnull().sum()
# removing rows with missing values from our dataset
df.dropna(inplace=True)
df.isnull().sum()
# remove the literal 'lbs' unit from the Weight column (regex=False keeps the
# match literal on every pandas version) and convert the column to int
df['Weight'] = df['Weight'].str.replace('lbs', '', regex=False).astype(int)
# checking that the change has been applied
df['Weight']
From observation, the Value, Wage and Release Clause columns abbreviate thousands with 'K' and millions with 'M'. We expand these suffixes into the amounts they stand for, take care of the decimal point (.) and the currency symbol, and change the columns' data types to int.
# The Value, Wage and Release Clause columns hold strings made of a '€'
# symbol, a number that may contain a decimal point, and an optional 'K'
# (thousand) or 'M' (million) suffix. Parse the number and multiply it by the
# suffix instead of deleting the '.' and appending a fixed run of zeros: the
# fixed-zeros approach is only right for one particular number of decimal
# digits (a whole-number 'M' value came out ten times too small).
def _money_to_int(series):
    """Return a Series of money strings converted to integer amounts.

    Each value has the '€' symbol removed, is split into its numeric part
    and an optional trailing 'K' / 'M', and is scaled by 1000 / 1000000
    (or by 1 when there is no suffix).
    """
    text = series.astype(str).str.strip().str.replace('€', '', regex=False)
    scale = text.str[-1].map({'K': 1000, 'M': 1000000}).fillna(1)
    number = text.str.rstrip('KM').astype(float)
    # round() absorbs binary floating-point noise such as 1.1 * 1000000
    return (number * scale).round().astype(int)


# change the three monetary columns from strings to integers
for money_col in ('Value', 'Wage', 'Release Clause'):
    df[money_col] = _money_to_int(df[money_col])
df['Value']
df.head(3)
df.describe(include='all')
# save the cleaned data set
df.to_csv('fifa_19_cleaned.csv', index=False)
# Histogram of player ages, labelled through the current axes.
df['Age'].hist()
plt.gca().set(
    xlabel='Age',
    ylabel='Frequency',
    title='Age distribution of players',
);
# Distribution of player weights (histogram with a KDE curve).
# NOTE(review): sns.distplot is deprecated since seaborn 0.11; switch to
# sns.histplot(df['Weight'], kde=True, stat='density') once the installed
# seaborn version is confirmed to provide it.
sns.distplot(df['Weight']);
# With its default KDE overlay, distplot normalises the histogram, so the
# y-axis shows a density rather than a count.
plt.ylabel('Density')
plt.title('Weight distributions of players');
def n_bar_plot(dataFrame, col, a, b, title, x_label, y_label):
    """Draw a bar chart of a slice of one column's value counts.

    ``dataFrame[col].value_counts()`` is sliced with ``[a:b]`` and plotted
    as bars; the current matplotlib axes then receive ``title`` and the
    axis labels ``x_label`` / ``y_label``.
    """
    counts = dataFrame[col].value_counts()
    selected = counts[a:b]
    selected.plot.bar()
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
# Bar chart of the first ten entries of the Club value counts.
n_bar_plot(
    dataFrame=df,
    col='Club',
    a=0,
    b=10,
    title='Most popular clubs',
    x_label='Club',
    y_label='count',
)
With the findings above, let's take a look at the players' nationalities.
# Bar chart of the first ten entries of the Nationality value counts.
n_bar_plot(
    dataFrame=df,
    col='Nationality',
    a=0,
    b=10,
    title='Most Players Nationality',
    x_label='Nationality',
    y_label='Frequency',
)
def bar_plot(dataFrame, col, title, x_label, y_label):
    """Draw a bar chart of all value counts of one column.

    ``dataFrame[col].value_counts()`` is plotted as bars; the current
    matplotlib axes then receive ``title`` and the axis labels
    ``x_label`` / ``y_label``.
    """
    counts = dataFrame[col].value_counts()
    counts.plot.bar()
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
# Bar chart of how often each preferred foot occurs.
bar_plot(
    dataFrame=df,
    col='Preferred Foot',
    title='Players Most Preferred Foot',
    x_label='Preferred Foot',
    y_label='count',
)
We have gained insight into the players' preferred foot; let's take a look at player positions to see where most players are situated on the pitch.
# Open an 8x4 figure, then draw the bar chart of position counts on it.
plt.figure(figsize=(8, 4))
bar_plot(
    dataFrame=df,
    col='Position',
    title='Players positioning',
    x_label='Position',
    y_label='count',
)
Before making my visualizations, there were some missing observations, which I cleaned in order to avoid errors and to be able to make a good analysis. Also, some of the column data types were changed to get the data into a suitable form for the job.
Let's look at the correlation between Weight and Finishing.
# Scatter plots relating Finishing to Weight, BallControl and Dribbling.
df.plot.scatter(x='Weight', y='Finishing', title='Weight by finishing');
df.plot.scatter(x='Finishing', y='BallControl', title='finishing and Ball control');
df.plot.scatter(x='Finishing', y='Dribbling', title='correlation between Dribbling and Finishing');
Let's look at the value of players based on body type.
# Violin plot of Value per Body Type, drawn in a single colour: the first
# entry of seaborn's current colour palette.
base_color = sns.color_palette()[0]
sns.violinplot(
    data=df,
    x='Body Type',
    y='Value',
    color=base_color,
).set_title('Value of player by Body type');
# Sum the wages per club and keep the ten largest totals.
club_by_wage = (
    df.groupby('Club')['Wage']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
club_by_wage.plot(kind='bar', title='Top 10 club with most player wage')
plt.ylabel('Amount');
# Bar-type categorical plot of Value for each Preferred Foot.
sns.catplot(
    data=df,
    x='Preferred Foot',
    y='Value',
    kind='bar',
);
plt.title('Most Preferred foot of players by Value');
# Pairwise relationship grid over the cleaned frame.
sns.pairplot(data=df);
# Scatter of Age (y) against Weight (x), coloured by Body Type.
g = sns.FacetGrid(
    df,
    hue='Body Type',
    palette='viridis_r',
    height=8,
    aspect=1.5,
)
# FacetGrid.map returns the grid itself, so the legend call can be chained.
g.map(plt.scatter, 'Weight', 'Age').add_legend();
plt.title('Age vs Weight and Body_type of players');
# Bar-type categorical plot of Value per Body Type, split by Preferred Foot.
sns.catplot(
    data=df,
    x='Body Type',
    y='Value',
    hue='Preferred Foot',
    kind='bar',
);
plt.title('Body_type vs Value and Preferred_foot');